Popular open-source NLP library, used to perform topic identification
and document comparison. https://radimrehurek.com/gensim/
Worked example: http://tlfvincent.github.io/2015/10/23/presidential-speech-topics/
In [2]:
from gensim.corpora.dictionary import Dictionary
from nltk.tokenize import word_tokenize
In [3]:
# A tiny toy corpus: six short movie-related documents.
my_documents = [
    'The movie was about a spaceship and aliens.',
    'I really liked the movie!',
    'Awesome action scenes, but boring characters.',
    'The movie was awful! I hate alien films.',
    'Space is cool! I liked the movie.',
    'More space films, please!',
]
In [6]:
# Lowercase and tokenize each document, then build a gensim Dictionary
# that maps every unique token to an integer id.
tokenized_docs = []
for doc in my_documents:
    tokenized_docs.append(word_tokenize(doc.lower()))

dictionary = Dictionary(tokenized_docs)
In [11]:
# Inspect the token -> integer id mapping the Dictionary built
# from the tokenized documents.
dictionary.token2id
Out[11]:
In [12]:
dictionary[9]
Out[12]:
In [9]:
# Convert every tokenized document into bag-of-words format.
# Each document becomes a list of (token_id, token_count) pairs —
# the id comes first, the frequency second.
corpus = []
for tokens in tokenized_docs:
    corpus.append(dictionary.doc2bow(tokens))

corpus
Out[9]:
In [14]:
# Take the fifth document's bag-of-words and order its
# (token_id, count) pairs by count, most frequent first.
doc = corpus[4]
bow_doc = sorted(doc, key=lambda pair: pair[1], reverse=True)
bow_doc
Out[14]:
In [16]:
# Show the five most frequent tokens of this document with their counts.
top_five = bow_doc[:5]
for token_id, count in top_five:
    print(dictionary.get(token_id), count)
In [21]:
## Corpus-wide word frequencies, summed over all the documents
# NOTE(review): imports appear mid-notebook; conventionally they belong
# in the import cell at the top.
from collections import defaultdict
from itertools import chain

# Flatten every document's (token_id, count) pairs into one stream and
# accumulate a total count per token id.
all_pairs = chain.from_iterable(corpus)
totalfreq = defaultdict(int)
for token_id, count in all_pairs:
    totalfreq[token_id] += count
In [23]:
sorted_freq = sorted(totalfreq.items(), key=lambda w: w[1], reverse=True)
In [24]:
# Display the five most common tokens across the whole corpus.
for token_id, total in sorted_freq[:5]:
    print(dictionary.get(token_id), total)